Run this code chunk to install the new packages that we’ll be using today:
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("gdsfmt")
BiocManager::install("SNPRelate")
Then, load these packages:
library(gdsfmt)
library(SNPRelate)
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyr)
# function to simulate data at two SNPs
sim_data_onevar <- function(pop, maf){
## pop = vector keeping track of which population each person belongs to
## maf = vector with the minor allele frequency in each population
# simulate SNP with a frequency depending on which population you're in
snp <- rbinom(n = length(pop), size = 2, p = maf[pop])
return(snp)
}
# create dataset with three sets of SNPs
popidx <- rep(1:2, each = 500)
maf1 <- c(0, 1)
maf2 <- c(0.3, 0.5)
maf3 <- c(0.1, 0.1)
set.seed(494)
snps1 <- replicate(3, sim_data_onevar(pop = popidx, maf = maf1))
snps2 <- replicate(3, sim_data_onevar(pop = popidx, maf = maf2))
snps3 <- replicate(10, sim_data_onevar(pop = popidx, maf = maf3))
dat <- cbind(popidx, snps1, snps2, snps3) %>%
as.data.frame()
names(dat) <- c('population', paste0('SNP', 1:(ncol(dat)-1)))
head(dat)
## population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9 SNP10 SNP11 SNP12
## 1 1 0 0 0 0 2 0 0 0 0 0 1 0
## 2 1 0 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 0 1 0 1 0 0
## 4 1 0 0 0 0 1 0 0 0 1 0 0 0
## 5 1 0 0 0 0 0 0 1 1 1 0 0 0
## 6 1 0 0 0 0 0 0 0 1 0 0 0 1
## SNP13 SNP14 SNP15 SNP16
## 1 1 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 1
## 5 0 1 0 0
## 6 0 0 1 0
train_data <- dat %>%
select(-population)
train_labels <- dat %>%
pull(population)
pca_out <- prcomp(train_data, center = T, scale = T)
pca_scores <- pca_out$x
pca_loadings <- pca_out$rotation
pca_var <- (pca_out$sdev)^2
# plot scores
pca_scores %>%
as.data.frame() %>%
mutate(pop = as.factor(train_labels)) %>%
ggplot(aes(x = PC1, y = PC2, color = pop)) +
geom_point() +
scale_color_brewer(palette = 'Dark2')
# look at loadings for the first PC
pca_loadings[, 'PC1']
## SNP1 SNP2 SNP3 SNP4 SNP5
## 0.5313109898 0.5313109898 0.5313109898 0.2196683364 0.2215912314
## SNP6 SNP7 SNP8 SNP9 SNP10
## 0.2222106280 0.0103846425 -0.0414667899 0.0001811429 -0.0080280080
## SNP11 SNP12 SNP13 SNP14 SNP15
## 0.0367174558 -0.0188089080 0.0064250922 0.0377916553 0.0038589821
## SNP16
## 0.0362399620
## loading plots
plot(pca_loadings[, 'PC1'], ylab = "Loadings for PC1")
plot(pca_loadings[, 'PC2'], ylab = "Loadings for PC2")
plot(pca_loadings[, 'PC3'], ylab = "Loadings for PC3")
plot(pca_loadings[, 'PC4'], ylab = "Loadings for PC4")
# calculate proportion of variance explained
pve <- pca_var/sum(pca_var)
# construct scree plots
plot(pve,
xlab = "Principal Component",
ylab = "Proportion of variance explained",
type = "b")
plot(cumsum(pve),
xlab = "Principal Component",
ylab = "Cumulative proportion of variance explained",
type = "b")
# parallel coordinates plot
pca_scores %>%
as.data.frame() %>%
mutate(population = as.factor(train_labels)) %>%
ggparcoord(columns = 1:14, groupColumn = 'population', alpha = 0.2) +
theme_minimal() +
scale_color_brewer(palette = 'Dark2')
# create dataset with three sets of SNPs
popidx <- rep(1:3, each = 500)
maf1 <- c(0, 0.8, 0.7)
maf2 <- c(0.3, 0.5, 0.6)
maf3 <- c(0.1, 0.1, 0.7)
set.seed(494)
snps1 <- replicate(3, sim_data_onevar(pop = popidx, maf = maf1))
snps2 <- replicate(3, sim_data_onevar(pop = popidx, maf = maf2))
snps3 <- replicate(10, sim_data_onevar(pop = popidx, maf = maf3))
dat <- cbind(popidx, snps1, snps2, snps3) %>%
as.data.frame()
names(dat) <- c('population', paste0('SNP', 1:(ncol(dat)-1)))
head(dat)
## population SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8 SNP9 SNP10 SNP11 SNP12
## 1 1 0 0 0 1 1 0 0 1 1 0 0 1
## 2 1 0 0 0 0 2 1 0 0 0 0 0 1
## 3 1 0 0 0 0 1 2 0 0 0 0 0 1
## 4 1 0 0 0 0 0 1 0 0 0 1 0 0
## 5 1 0 0 0 2 0 0 1 0 0 0 0 1
## 6 1 0 0 0 1 2 0 1 0 0 0 0 0
## SNP13 SNP14 SNP15 SNP16
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
train_data <- dat %>%
select(-population)
train_labels <- dat %>%
pull(population)
pca_out <- prcomp(train_data, center = T, scale = T)
pca_scores <- pca_out$x
pca_loadings <- pca_out$rotation
pca_var <- (pca_out$sdev)^2
# plot scores
pca_scores %>%
as.data.frame() %>%
mutate(pop = as.factor(train_labels)) %>%
ggplot(aes(x = PC1, y = PC2, color = pop)) +
geom_point() +
scale_color_brewer(palette = 'Dark2')
# look at loadings for the first PC
pca_loadings[, c('PC1', 'PC2')]
## PC1 PC2
## SNP1 0.1757481 -0.5057389
## SNP2 0.1848223 -0.5014436
## SNP3 0.1799890 -0.5061206
## SNP4 0.1197950 -0.1999222
## SNP5 0.1494680 -0.1618136
## SNP6 0.1361830 -0.1731812
## SNP7 0.2861616 0.1239207
## SNP8 0.2977443 0.1214676
## SNP9 0.2891597 0.1223471
## SNP10 0.2907584 0.1038532
## SNP11 0.2944655 0.1272194
## SNP12 0.2981784 0.1167594
## SNP13 0.2929242 0.1146459
## SNP14 0.2873783 0.1097595
## SNP15 0.2829128 0.1240317
## SNP16 0.2908182 0.1192597
# calculate proportion of variance explained
pve <- pca_var/sum(pca_var)
# construct scree plots
plot(pve,
xlab = "Principal Component",
ylab = "Proportion of variance explained",
type = "b")
plot(cumsum(pve),
xlab = "Principal Component",
ylab = "Cumulative proportion of variance explained",
type = "b")
# parallel coordinates plot
pca_scores %>%
as.data.frame() %>%
mutate(population = as.factor(train_labels)) %>%
ggparcoord(columns = 1:14, groupColumn = 'population', alpha = 0.2) +
theme_minimal() +
scale_color_brewer(palette = 'Dark2')